{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# CNKI数据挖掘"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 模块准备\n",
    "from selenium import webdriver\n",
    "from selenium.webdriver.common.desired_capabilities import DesiredCapabilities\n",
    "\n",
    "opts = webdriver.ChromeOptions()\n",
    "opts.add_argument('--no-sandbox')#解决DevToolsActivePort文件不存在的报错\n",
    "opts.add_argument('window-size=1920x3000') #指定浏览器分辨率\n",
    "opts.add_argument('--disable-gpu') #谷歌文档提到需要加上一这个属性来规避bug\n",
    "opts.add_argument('--hide-scrollbars')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from lxml.html import fromstring\n",
    "import time\n",
    "from random import random"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 打开CNKI\n",
    "* 1.校园网，自动登录，直接进行操作。\n",
    "* 2.连接校园网，进行登录后操作。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<ipython-input-3-642ae2f3684a>:2: DeprecationWarning: use options instead of chrome_options\n",
      "  driver = webdriver.Chrome( chrome_options = opts)\n"
     ]
    }
   ],
   "source": [
    "# 打开浏览器\n",
    "driver = webdriver.Chrome( chrome_options = opts)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 输入CNKI网址\n",
    "driver.get(\"https://cnki.net\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 点击登录账号（如果自动登录可忽略）"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 点击登录   没登自己登，有就跳过\n",
    "element=driver.find_element_by_xpath('/html/body/div[1]/div[1]/div/div/div/div[1]/div/div/div[4]/a')\n",
    "element.get_attribute('innerHTML')# 不要直接click（）等相关操作，要先通过xpath找到正确的element\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 输入账号密码\n",
    "payload={\"account\":\"1910350072\",\"password\":\"02286380\"}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 填写账号\n",
    "element=driver.find_element_by_xpath('/html/body/form/div[4]/div/div/div[3]/input')\n",
    "element.get_attribute('innerHTML')\n",
    "element.clear()\n",
    "element.send_keys(payload['account'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 填写密码\n",
    "element=driver.find_element_by_xpath('/html/body/form/div[4]/div/div/div[5]/input')\n",
    "element.get_attribute('innerHTML')\n",
    "element.clear()\n",
    "element.send_keys(payload['password'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 点击登录 \n",
    "element=driver.find_element_by_xpath('/html/body/form/div[4]/div/div/div[9]/a[2]')\n",
    "element.get_attribute('innerHTML')\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'中山大学南方学院'"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 查看登录名\n",
    "element=driver.find_element_by_id('Ecp_loginShowName1')\n",
    "element.get_attribute('innerHTML')# 不要直接click（）等相关操作，要先通过xpath找到正确的element"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 打开高级检索"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "高级检索\n"
     ]
    }
   ],
   "source": [
    "# 点击高级检索\n",
    "element=driver.find_element_by_id('highSearch')\n",
    "print(element.get_attribute('innerHTML'))\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 查看窗口信息、跳转窗口"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'CDwindow-A52E504B4221D8947E9D3CE94C46D0D8'"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 当前窗口信息\n",
    "driver.current_window_handle"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['CDwindow-A52E504B4221D8947E9D3CE94C46D0D8',\n",
       " 'CDwindow-FBF8B320C59D71E366FAAC2B32887BD2']"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "## 查看所有窗口\n",
    "driver.window_handles"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<ipython-input-9-b24a4b09c9a8>:2: DeprecationWarning: use driver.switch_to.window instead\n",
      "  driver.switch_to_window(driver.window_handles[1])\n"
     ]
    }
   ],
   "source": [
    "# 跳转到第二个页面\n",
    "driver.switch_to_window(driver.window_handles[1])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 选择学术期刊及专业检索"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 点学术期刊\n",
    "element=driver.find_element_by_xpath('//ul[@class=\"doctype-menus keji\"]/li[@data-id=\"xsqk\"]/a')\n",
    "element.get_attribute('innerHTML')\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 点专业检索\n",
    "element=driver.find_element_by_name('majorSearch')\n",
    "element.get_attribute('innerHTML')\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 选择期刊来源"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "# SCI  //div[@class=\"exthed-tit-labels\"]//input[@key=\"CSI\"]\n",
    "element=driver.find_element_by_xpath('/html/body/div[2]/div/div[2]/div/div[1]/div[1]/div[2]/div[1]/div[3]/div/label[2]/input')\n",
    "element.get_attribute('innerHTML')\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "# EI\n",
    "element=driver.find_element_by_xpath('/html/body/div[2]/div/div[2]/div/div[1]/div[1]/div[2]/div[1]/div[3]/div/label[3]/input')\n",
    "element.get_attribute('innerHTML')\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 北大核心\n",
    "element=driver.find_element_by_xpath('/html/body/div[2]/div/div[2]/div/div[1]/div[1]/div[2]/div[1]/div[3]/div/label[4]/input')\n",
    "element.get_attribute('innerHTML')\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "# CSSCI\n",
    "element=driver.find_element_by_xpath('/html/body/div[2]/div/div[2]/div/div[1]/div[1]/div[2]/div[1]/div[3]/div/label[5]/input')\n",
    "element.get_attribute('innerHTML')\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "# SCCD\n",
    "element=driver.find_element_by_xpath('/html/body/div[2]/div/div[2]/div/div[1]/div[1]/div[2]/div[1]/div[3]/div/label[6]/input')\n",
    "element.get_attribute('innerHTML')\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 输入检索内容"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 数据说明  \n",
    "* 对主题=“大数据”并且 篇名=“跨学科”或者 篇名=“传播” 或者 文章内容包括“新媒体” 并且 论文关键词=“融合” 或者 论文摘要=“趋势”并且文章内容=“媒介”的论文进行检索。总体围绕大数据下新媒体融合的发展趋势进行检索。  \n",
    "\n",
    "## 检索意义\n",
    "* 本人想要探究当代大数据主题下，关于跨学科或传播相关篇名的论文研究成果，进行查看及进一步的分析。同时想要了解大数据主题下，关于论文中包含新媒体，以“融合”为关键字的学术研究或摘要包含趋势同时文章内容包含媒介的相关学术研究。对最后筛选的结果进行论文查看和学习，从而提高本人对大数据背景下新媒体相关变化的敏感度。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "## 设置query\n",
    "# query='SU = \"传播\" AND (TI = \"跨学科\" OR TI =\"大数据\" OR TI =\"媒体\" AND TI =\"融合\")'\n",
    "query='SU = \"大数据\" AND (TI = \"跨学科\" OR TI=\"传播\" OR FT=\"新媒体\" AND  KY =\"融合\" OR AB=\"趋势\" AND FT=\"媒介\" )'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 输入检索要求\n",
    "element = driver.find_element_by_xpath('//textarea')\n",
    "element.clear()\n",
    "element.send_keys(query)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 点击检索\n",
    "element=driver.find_element_by_xpath('//input[@value=\"检索\"]')\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'914'"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 检索数量\n",
    "element=driver.find_element_by_xpath('//span[@class=\"pagerTitleCell\"]')\n",
    "element.get_attribute('innerHTML').split('<em>')[1].split('</em>')[0]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 页面显示50行及按发表时间正序排序"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 点击更换显示页数\n",
    "element = driver.find_element_by_xpath('//i[@class=\"icon icon-sort\"]')\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 点击50页，不行的话先运行上一步\n",
    "element =driver.find_element_by_xpath('//div[@id=\"perPageDiv\"]//li[@data-val=\"50\"]')\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 按发表时间正序排序\n",
    "element =driver.find_element_by_xpath('/html/body/div[3]/div[2]/div[2]/div[2]/form/div/div[1]/div[2]/div[3]/ul/li[2]')\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 获取第一页信息"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Unnamed: 0</th>\n",
       "      <th>篇名</th>\n",
       "      <th>作者</th>\n",
       "      <th>刊名</th>\n",
       "      <th>发表时间</th>\n",
       "      <th>被引</th>\n",
       "      <th>下载</th>\n",
       "      <th>操作</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>社会化媒体、移动终端、大数据:影响新闻生产的新技术因素</td>\n",
       "      <td>彭兰</td>\n",
       "      <td>新闻界</td>\n",
       "      <td>2012-08-20</td>\n",
       "      <td>447.0</td>\n",
       "      <td>13715</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>大数据带给图书馆的影响与挑战</td>\n",
       "      <td>韩翠峰</td>\n",
       "      <td>图书与情报</td>\n",
       "      <td>2012-10-15</td>\n",
       "      <td>328.0</td>\n",
       "      <td>9489</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>传统媒体中“大数据”的“微传播”</td>\n",
       "      <td>袁然</td>\n",
       "      <td>青年记者</td>\n",
       "      <td>2012-10-20</td>\n",
       "      <td>14.0</td>\n",
       "      <td>775</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>后媒体时代的新闻生产——2012新媒体年度盘点</td>\n",
       "      <td>栾轶玫</td>\n",
       "      <td>新闻与写作</td>\n",
       "      <td>2012-12-05</td>\n",
       "      <td>9.0</td>\n",
       "      <td>447</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>危机传播:无需逃避的现代性“陷阱”</td>\n",
       "      <td>王朋进</td>\n",
       "      <td>青年记者</td>\n",
       "      <td>2013-01-10</td>\n",
       "      <td>1.0</td>\n",
       "      <td>164</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>6</td>\n",
       "      <td>迈向大媒介时代——2012年度我国媒介融合评述</td>\n",
       "      <td>谭天; 赵静雯; 苏慧</td>\n",
       "      <td>传媒</td>\n",
       "      <td>2013-01-15</td>\n",
       "      <td>7.0</td>\n",
       "      <td>1333</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>7</td>\n",
       "      <td>大数据时代传媒经济研究框架及工具的演化——2012年我国传媒经济研究文献综述</td>\n",
       "      <td>喻国明; 何睿</td>\n",
       "      <td>国际新闻界</td>\n",
       "      <td>2013-01-23</td>\n",
       "      <td>34.0</td>\n",
       "      <td>7576</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>8</td>\n",
       "      <td>微电影、大数据、三网融合:中国传媒业跨入新传播时代的门槛——社会视角下的2012中国传媒业关键词</td>\n",
       "      <td>喻国明; 宋美杰</td>\n",
       "      <td>编辑之友</td>\n",
       "      <td>2013-02-05</td>\n",
       "      <td>37.0</td>\n",
       "      <td>6280</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>9</td>\n",
       "      <td>大数据时代数字出版产业的发展趋势</td>\n",
       "      <td>孙玉玲</td>\n",
       "      <td>出版发行研究</td>\n",
       "      <td>2013-04-15</td>\n",
       "      <td>101.0</td>\n",
       "      <td>4544</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>10</td>\n",
       "      <td>我国报业发展走向:区域性、大数据、融合性</td>\n",
       "      <td>黄楚新</td>\n",
       "      <td>新闻与写作</td>\n",
       "      <td>2013-05-05</td>\n",
       "      <td>10.0</td>\n",
       "      <td>478</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>11</td>\n",
       "      <td>“大数据”时代计算机数据的财产化与刑法保护</td>\n",
       "      <td>于志刚</td>\n",
       "      <td>青海社会科学</td>\n",
       "      <td>2013-05-30</td>\n",
       "      <td>81.0</td>\n",
       "      <td>2691</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>12</td>\n",
       "      <td>危机传播管理体系的特点与作用</td>\n",
       "      <td>来向武</td>\n",
       "      <td>青年记者</td>\n",
       "      <td>2013-06-10</td>\n",
       "      <td>NaN</td>\n",
       "      <td>130</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>13</td>\n",
       "      <td>网络大数据:现状与展望</td>\n",
       "      <td>王元卓; 靳小龙; 程学旗</td>\n",
       "      <td>计算机学报</td>\n",
       "      <td>2013-06-15</td>\n",
       "      <td>1308.0</td>\n",
       "      <td>77686</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>14</td>\n",
       "      <td>大数据对我国政府信息公开立法修改的启示</td>\n",
       "      <td>张毅菁</td>\n",
       "      <td>图书情报工作</td>\n",
       "      <td>2013-06-15</td>\n",
       "      <td>46.0</td>\n",
       "      <td>1894</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>15</td>\n",
       "      <td>大数据时代教育的可能转向</td>\n",
       "      <td>喻长志</td>\n",
       "      <td>江淮论坛</td>\n",
       "      <td>2013-07-10</td>\n",
       "      <td>148.0</td>\n",
       "      <td>4322</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>16</td>\n",
       "      <td>壮大主流思想舆论网上传播最优路径探析</td>\n",
       "      <td>谭可可</td>\n",
       "      <td>学术论坛</td>\n",
       "      <td>2013-07-10</td>\n",
       "      <td>8.0</td>\n",
       "      <td>205</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>17</td>\n",
       "      <td>《小时代》票房飘红对大数据时代传播力建设的启示</td>\n",
       "      <td>张鑫</td>\n",
       "      <td>中国记者</td>\n",
       "      <td>2013-08-01</td>\n",
       "      <td>6.0</td>\n",
       "      <td>892</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>18</td>\n",
       "      <td>大数据助力社会科学研究:挑战与创新</td>\n",
       "      <td>沈浩; 黄晓兰</td>\n",
       "      <td>现代传播(中国传媒大学学报)</td>\n",
       "      <td>2013-08-15</td>\n",
       "      <td>113.0</td>\n",
       "      <td>4553</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>19</td>\n",
       "      <td>大数据时代的新闻与传播学教育:专业设置、学生技能、师资来源</td>\n",
       "      <td>祝建华</td>\n",
       "      <td>新闻大学</td>\n",
       "      <td>2013-08-15</td>\n",
       "      <td>67.0</td>\n",
       "      <td>2933</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>20</td>\n",
       "      <td>现阶段中国社会舆情的态势、热点与传播机制研究</td>\n",
       "      <td>喻国明</td>\n",
       "      <td>中国人民大学学报</td>\n",
       "      <td>2013-09-16</td>\n",
       "      <td>11.0</td>\n",
       "      <td>1257</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20</th>\n",
       "      <td>21</td>\n",
       "      <td>社会信息化发展的新趋势与产业变革</td>\n",
       "      <td>王世伟</td>\n",
       "      <td>情报资料工作</td>\n",
       "      <td>2013-09-25</td>\n",
       "      <td>7.0</td>\n",
       "      <td>365</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>21</th>\n",
       "      <td>22</td>\n",
       "      <td>大数据与信息化教学变革</td>\n",
       "      <td>金陵</td>\n",
       "      <td>中国电化教育</td>\n",
       "      <td>2013-10-10</td>\n",
       "      <td>345.0</td>\n",
       "      <td>11957</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>22</th>\n",
       "      <td>23</td>\n",
       "      <td>关注传播业变局下报纸从业者的“个体转型”</td>\n",
       "      <td>向熹</td>\n",
       "      <td>新闻界</td>\n",
       "      <td>2013-10-25</td>\n",
       "      <td>5.0</td>\n",
       "      <td>82</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>23</th>\n",
       "      <td>24</td>\n",
       "      <td>数字传播时代RTB(实时竞价)广告模式研究</td>\n",
       "      <td>周楚莉</td>\n",
       "      <td>中国记者</td>\n",
       "      <td>2013-11-01</td>\n",
       "      <td>19.0</td>\n",
       "      <td>882</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24</th>\n",
       "      <td>25</td>\n",
       "      <td>全媒体信息整合与传播——中外电视媒体的演进与操作路径研究</td>\n",
       "      <td>付晓光; 曾祥敏</td>\n",
       "      <td>现代传播(中国传媒大学学报)</td>\n",
       "      <td>2013-11-15</td>\n",
       "      <td>4.0</td>\n",
       "      <td>785</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25</th>\n",
       "      <td>26</td>\n",
       "      <td>投放精准及理念转型——大数据时代互联网广告的传播逻辑重构</td>\n",
       "      <td>张辉锋; 金韶</td>\n",
       "      <td>当代传播</td>\n",
       "      <td>2013-11-15</td>\n",
       "      <td>67.0</td>\n",
       "      <td>3125</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>26</th>\n",
       "      <td>27</td>\n",
       "      <td>社会计算:大数据时代的机遇与挑战</td>\n",
       "      <td>孟小峰; 李勇; 祝建华</td>\n",
       "      <td>计算机研究与发展</td>\n",
       "      <td>2013-11-30 23:04</td>\n",
       "      <td>200.0</td>\n",
       "      <td>7658</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>27</th>\n",
       "      <td>28</td>\n",
       "      <td>大数据时代的电视媒体覆盖与传播——2013美兰德媒体传播通路与受众研究创新调研成果在京发布</td>\n",
       "      <td>吴彦华</td>\n",
       "      <td>当代电视</td>\n",
       "      <td>2013-12-01</td>\n",
       "      <td>4.0</td>\n",
       "      <td>288</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>28</th>\n",
       "      <td>29</td>\n",
       "      <td>大数据时代网络舆情传播形态与引导战略</td>\n",
       "      <td>尹亚辉</td>\n",
       "      <td>新闻知识</td>\n",
       "      <td>2013-12-15</td>\n",
       "      <td>33.0</td>\n",
       "      <td>1077</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29</th>\n",
       "      <td>30</td>\n",
       "      <td>2013年出版产业十大关键词解读</td>\n",
       "      <td>肖东发; 卞卓舟</td>\n",
       "      <td>编辑之友</td>\n",
       "      <td>2014-01-05</td>\n",
       "      <td>8.0</td>\n",
       "      <td>718</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>30</th>\n",
       "      <td>31</td>\n",
       "      <td>社交媒体时代的网络舆情——生态变化及舆情研究现状、趋势</td>\n",
       "      <td>李彪; 郑满宁</td>\n",
       "      <td>新闻记者</td>\n",
       "      <td>2014-01-05</td>\n",
       "      <td>92.0</td>\n",
       "      <td>5366</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>31</th>\n",
       "      <td>32</td>\n",
       "      <td>大数据时代的电影营销</td>\n",
       "      <td>刘婧雅; 文田</td>\n",
       "      <td>电影艺术</td>\n",
       "      <td>2014-01-05</td>\n",
       "      <td>37.0</td>\n",
       "      <td>3942</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>32</th>\n",
       "      <td>33</td>\n",
       "      <td>大数据时代面向全媒体的电视剧传播</td>\n",
       "      <td>薛继军</td>\n",
       "      <td>电视研究</td>\n",
       "      <td>2014-01-05</td>\n",
       "      <td>10.0</td>\n",
       "      <td>677</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>33</th>\n",
       "      <td>34</td>\n",
       "      <td>竞合、聚合、整合——2013媒介融合盘点</td>\n",
       "      <td>谭天; 杨伟龙</td>\n",
       "      <td>传媒</td>\n",
       "      <td>2014-01-08</td>\n",
       "      <td>7.0</td>\n",
       "      <td>996</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>34</th>\n",
       "      <td>35</td>\n",
       "      <td>首届中国教育信息化行业新年论坛召开</td>\n",
       "      <td>马小强</td>\n",
       "      <td>中国电化教育</td>\n",
       "      <td>2014-01-10</td>\n",
       "      <td>NaN</td>\n",
       "      <td>603</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>35</th>\n",
       "      <td>36</td>\n",
       "      <td>基于互联网统一平台的中国信息消费研究</td>\n",
       "      <td>王兴全</td>\n",
       "      <td>社会科学</td>\n",
       "      <td>2014-01-10</td>\n",
       "      <td>15.0</td>\n",
       "      <td>956</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>36</th>\n",
       "      <td>37</td>\n",
       "      <td>品牌、自媒体与大数据的未来——当代传播语境下川北灯戏发展策略论析</td>\n",
       "      <td>岳莹</td>\n",
       "      <td>宁夏社会科学</td>\n",
       "      <td>2014-01-15</td>\n",
       "      <td>7.0</td>\n",
       "      <td>454</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>37</th>\n",
       "      <td>38</td>\n",
       "      <td>大数据时代的传播观念变革</td>\n",
       "      <td>倪宁</td>\n",
       "      <td>西北大学学报(哲学社会科学版)</td>\n",
       "      <td>2014-01-15</td>\n",
       "      <td>55.0</td>\n",
       "      <td>1718</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>38</th>\n",
       "      <td>39</td>\n",
       "      <td>大数据时代传播研究中语料库分析方法的价值</td>\n",
       "      <td>喻国明; 李慧娟</td>\n",
       "      <td>传媒</td>\n",
       "      <td>2014-01-23</td>\n",
       "      <td>14.0</td>\n",
       "      <td>1311</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>39</th>\n",
       "      <td>40</td>\n",
       "      <td>重压之下中国传媒经济研究的主题:2013年传媒经济研究文献综述</td>\n",
       "      <td>喻国明; 何睿</td>\n",
       "      <td>国际新闻界</td>\n",
       "      <td>2014-01-23</td>\n",
       "      <td>9.0</td>\n",
       "      <td>2295</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>40</th>\n",
       "      <td>41</td>\n",
       "      <td>2013年中国新媒体传播研究综述</td>\n",
       "      <td>付玉辉</td>\n",
       "      <td>国际新闻界</td>\n",
       "      <td>2014-01-23</td>\n",
       "      <td>8.0</td>\n",
       "      <td>2474</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>41</th>\n",
       "      <td>42</td>\n",
       "      <td>大数据时代的精准广告及其传播策略——基于场域理论视角</td>\n",
       "      <td>倪宁; 金韶</td>\n",
       "      <td>现代传播(中国传媒大学学报)</td>\n",
       "      <td>2014-02-15</td>\n",
       "      <td>188.0</td>\n",
       "      <td>10562</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>42</th>\n",
       "      <td>43</td>\n",
       "      <td>4G时代:电视生产传播新模式</td>\n",
       "      <td>杨振荣; 杨咏</td>\n",
       "      <td>新闻战线</td>\n",
       "      <td>2014-02-15</td>\n",
       "      <td>3.0</td>\n",
       "      <td>98</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>43</th>\n",
       "      <td>44</td>\n",
       "      <td>新媒体语境下的电视生产与传播机制创新</td>\n",
       "      <td>赵肖雄</td>\n",
       "      <td>中国电视</td>\n",
       "      <td>2014-02-15</td>\n",
       "      <td>3.0</td>\n",
       "      <td>324</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>44</th>\n",
       "      <td>45</td>\n",
       "      <td>大数据时代的传播特征</td>\n",
       "      <td>周子渊</td>\n",
       "      <td>青年记者</td>\n",
       "      <td>2014-02-20</td>\n",
       "      <td>20.0</td>\n",
       "      <td>518</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>45</th>\n",
       "      <td>46</td>\n",
       "      <td>大数据视域下的科技期刊数据库建设</td>\n",
       "      <td>刘俊; 张昕</td>\n",
       "      <td>编辑学报</td>\n",
       "      <td>2014-02-25</td>\n",
       "      <td>32.0</td>\n",
       "      <td>996</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>46</th>\n",
       "      <td>47</td>\n",
       "      <td>2013年中国传媒业的发展状况与未来态势——基于智能化社会文本分析技术视野中的中国传媒业</td>\n",
       "      <td>喻国明; 杨雅</td>\n",
       "      <td>社会科学战线</td>\n",
       "      <td>2014-03-01</td>\n",
       "      <td>3.0</td>\n",
       "      <td>1128</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>47</th>\n",
       "      <td>48</td>\n",
       "      <td>媒介融合背景下新闻出版业发展趋势</td>\n",
       "      <td>杜建华</td>\n",
       "      <td>编辑之友</td>\n",
       "      <td>2014-03-05</td>\n",
       "      <td>6.0</td>\n",
       "      <td>454</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>48</th>\n",
       "      <td>49</td>\n",
       "      <td>IT第三平台对开放教育资源建设的影响</td>\n",
       "      <td>梁小庆</td>\n",
       "      <td>中国远程教育</td>\n",
       "      <td>2014-03-06</td>\n",
       "      <td>3.0</td>\n",
       "      <td>240</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>49</th>\n",
       "      <td>50</td>\n",
       "      <td>数据新闻:大数据时代新闻可视化传播的创新路径</td>\n",
       "      <td>郎劲松; 杨海</td>\n",
       "      <td>现代传播(中国传媒大学学报)</td>\n",
       "      <td>2014-03-15</td>\n",
       "      <td>257.0</td>\n",
       "      <td>12605</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    Unnamed: 0                                                篇名  \\\n",
       "0            1                       社会化媒体、移动终端、大数据:影响新闻生产的新技术因素   \n",
       "1            2                                    大数据带给图书馆的影响与挑战   \n",
       "2            3                                  传统媒体中“大数据”的“微传播”   \n",
       "3            4                           后媒体时代的新闻生产——2012新媒体年度盘点   \n",
       "4            5                                 危机传播:无需逃避的现代性“陷阱”   \n",
       "5            6                           迈向大媒介时代——2012年度我国媒介融合评述   \n",
       "6            7            大数据时代传媒经济研究框架及工具的演化——2012年我国传媒经济研究文献综述   \n",
       "7            8  微电影、大数据、三网融合:中国传媒业跨入新传播时代的门槛——社会视角下的2012中国传媒业关键词   \n",
       "8            9                                  大数据时代数字出版产业的发展趋势   \n",
       "9           10                              我国报业发展走向:区域性、大数据、融合性   \n",
       "10          11                             “大数据”时代计算机数据的财产化与刑法保护   \n",
       "11          12                                    危机传播管理体系的特点与作用   \n",
       "12          13                                       网络大数据:现状与展望   \n",
       "13          14                               大数据对我国政府信息公开立法修改的启示   \n",
       "14          15                                      大数据时代教育的可能转向   \n",
       "15          16                                壮大主流思想舆论网上传播最优路径探析   \n",
       "16          17                           《小时代》票房飘红对大数据时代传播力建设的启示   \n",
       "17          18                                 大数据助力社会科学研究:挑战与创新   \n",
       "18          19                     大数据时代的新闻与传播学教育:专业设置、学生技能、师资来源   \n",
       "19          20                            现阶段中国社会舆情的态势、热点与传播机制研究   \n",
       "20          21                                  社会信息化发展的新趋势与产业变革   \n",
       "21          22                                       大数据与信息化教学变革   \n",
       "22          23                              关注传播业变局下报纸从业者的“个体转型”   \n",
       "23          24                             数字传播时代RTB(实时竞价)广告模式研究   \n",
       "24          25                      全媒体信息整合与传播——中外电视媒体的演进与操作路径研究   \n",
       "25          26                      投放精准及理念转型——大数据时代互联网广告的传播逻辑重构   \n",
       "26          27                                  社会计算:大数据时代的机遇与挑战   \n",
       "27          28     大数据时代的电视媒体覆盖与传播——2013美兰德媒体传播通路与受众研究创新调研成果在京发布   \n",
       "28          29                                大数据时代网络舆情传播形态与引导战略   \n",
       "29          30                                  2013年出版产业十大关键词解读   \n",
       "30          31                       社交媒体时代的网络舆情——生态变化及舆情研究现状、趋势   \n",
       "31          32                                        大数据时代的电影营销   \n",
       "32          33                                  大数据时代面向全媒体的电视剧传播   \n",
       "33          34                              竞合、聚合、整合——2013媒介融合盘点   \n",
       "34          35                                 首届中国教育信息化行业新年论坛召开   \n",
       "35          36                                基于互联网统一平台的中国信息消费研究   \n",
       "36          37                  品牌、自媒体与大数据的未来——当代传播语境下川北灯戏发展策略论析   \n",
       "37          38                                      大数据时代的传播观念变革   \n",
       "38          39                              大数据时代传播研究中语料库分析方法的价值   \n",
       "39          40                   重压之下中国传媒经济研究的主题:2013年传媒经济研究文献综述   \n",
       "40          41                                  2013年中国新媒体传播研究综述   \n",
       "41          42                        大数据时代的精准广告及其传播策略——基于场域理论视角   \n",
       "42          43                                    4G时代:电视生产传播新模式   \n",
       "43          44                                新媒体语境下的电视生产与传播机制创新   \n",
       "44          45                                        大数据时代的传播特征   \n",
       "45          46                                  大数据视域下的科技期刊数据库建设   \n",
       "46          47      2013年中国传媒业的发展状况与未来态势——基于智能化社会文本分析技术视野中的中国传媒业   \n",
       "47          48                                  媒介融合背景下新闻出版业发展趋势   \n",
       "48          49                                IT第三平台对开放教育资源建设的影响   \n",
       "49          50                            数据新闻:大数据时代新闻可视化传播的创新路径   \n",
       "\n",
       "               作者               刊名              发表时间      被引     下载  操作  \n",
       "0              彭兰              新闻界        2012-08-20   447.0  13715  下载  \n",
       "1             韩翠峰            图书与情报        2012-10-15   328.0   9489  下载  \n",
       "2              袁然             青年记者        2012-10-20    14.0    775  下载  \n",
       "3             栾轶玫            新闻与写作        2012-12-05     9.0    447  下载  \n",
       "4             王朋进             青年记者        2013-01-10     1.0    164  下载  \n",
       "5     谭天; 赵静雯; 苏慧               传媒        2013-01-15     7.0   1333  下载  \n",
       "6         喻国明; 何睿            国际新闻界        2013-01-23    34.0   7576  下载  \n",
       "7        喻国明; 宋美杰             编辑之友        2013-02-05    37.0   6280  下载  \n",
       "8             孙玉玲           出版发行研究        2013-04-15   101.0   4544  下载  \n",
       "9             黄楚新            新闻与写作        2013-05-05    10.0    478  下载  \n",
       "10            于志刚           青海社会科学        2013-05-30    81.0   2691  下载  \n",
       "11            来向武             青年记者        2013-06-10     NaN    130  下载  \n",
       "12  王元卓; 靳小龙; 程学旗            计算机学报        2013-06-15  1308.0  77686  下载  \n",
       "13            张毅菁           图书情报工作        2013-06-15    46.0   1894  下载  \n",
       "14            喻长志             江淮论坛        2013-07-10   148.0   4322  下载  \n",
       "15            谭可可             学术论坛        2013-07-10     8.0    205  下载  \n",
       "16             张鑫             中国记者        2013-08-01     6.0    892  下载  \n",
       "17        沈浩; 黄晓兰   现代传播(中国传媒大学学报)        2013-08-15   113.0   4553  下载  \n",
       "18            祝建华             新闻大学        2013-08-15    67.0   2933  下载  \n",
       "19            喻国明         中国人民大学学报        2013-09-16    11.0   1257  下载  \n",
       "20            王世伟           情报资料工作        2013-09-25     7.0    365  下载  \n",
       "21             金陵           中国电化教育        2013-10-10   345.0  11957  下载  \n",
       "22             向熹              新闻界        2013-10-25     5.0     82  下载  \n",
       "23            周楚莉             中国记者        2013-11-01    19.0    882  下载  \n",
       "24       付晓光; 曾祥敏   现代传播(中国传媒大学学报)        2013-11-15     4.0    785  下载  \n",
       "25        张辉锋; 金韶             当代传播        2013-11-15    67.0   3125  下载  \n",
       "26   孟小峰; 李勇; 祝建华         计算机研究与发展  2013-11-30 23:04   200.0   7658  下载  \n",
       "27            吴彦华             当代电视        2013-12-01     4.0    288  下载  \n",
       "28            尹亚辉             新闻知识        2013-12-15    33.0   1077  下载  \n",
       "29       肖东发; 卞卓舟             编辑之友        2014-01-05     8.0    718  下载  \n",
       "30        李彪; 郑满宁             新闻记者        2014-01-05    92.0   5366  下载  \n",
       "31        刘婧雅; 文田             电影艺术        2014-01-05    37.0   3942  下载  \n",
       "32            薛继军             电视研究        2014-01-05    10.0    677  下载  \n",
       "33        谭天; 杨伟龙               传媒        2014-01-08     7.0    996  下载  \n",
       "34            马小强           中国电化教育        2014-01-10     NaN    603  下载  \n",
       "35            王兴全             社会科学        2014-01-10    15.0    956  下载  \n",
       "36             岳莹           宁夏社会科学        2014-01-15     7.0    454  下载  \n",
       "37             倪宁  西北大学学报(哲学社会科学版)        2014-01-15    55.0   1718  下载  \n",
       "38       喻国明; 李慧娟               传媒        2014-01-23    14.0   1311  下载  \n",
       "39        喻国明; 何睿            国际新闻界        2014-01-23     9.0   2295  下载  \n",
       "40            付玉辉            国际新闻界        2014-01-23     8.0   2474  下载  \n",
       "41         倪宁; 金韶   现代传播(中国传媒大学学报)        2014-02-15   188.0  10562  下载  \n",
       "42        杨振荣; 杨咏             新闻战线        2014-02-15     3.0     98  下载  \n",
       "43            赵肖雄             中国电视        2014-02-15     3.0    324  下载  \n",
       "44            周子渊             青年记者        2014-02-20    20.0    518  下载  \n",
       "45         刘俊; 张昕             编辑学报        2014-02-25    32.0    996  下载  \n",
       "46        喻国明; 杨雅           社会科学战线        2014-03-01     3.0   1128  下载  \n",
       "47            杜建华             编辑之友        2014-03-05     6.0    454  下载  \n",
       "48            梁小庆           中国远程教育        2014-03-06     3.0    240  下载  \n",
       "49        郎劲松; 杨海   现代传播(中国传媒大学学报)        2014-03-15   257.0  12605  下载  "
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 获取页面信息   可以先不运行\n",
    "element = driver.find_element_by_id('gridTable')\n",
    "首页 = element.get_attribute('innerHTML')\n",
    "首页_表格=pd.read_html(首页)[0]\n",
    "首页_表格"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 翻页"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'下一页'"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 翻页\n",
    "element = driver.find_element_by_id('PageNext')\n",
    "element.get_attribute('innerHTML')\n",
    "# element.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# 刷新\n",
    "# driver.refresh()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 建立翻页函数"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "表格_html = dict()\n",
    "main_content =\"\"\n",
    "element = None"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18]\n"
     ]
    }
   ],
   "source": [
    "# 循环页面\n",
    "pages = list(range(1,19))\n",
    "print(pages)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "def process_pages (pages):\n",
    "    for p in pages:\n",
    "        print (p,end='\\t')\n",
    "        跳转 = driver.find_element_by_id('PageNext')\n",
    "        跳转.click()\n",
    "        time.sleep(30+20*random())\n",
    "        element = driver.find_element_by_id('gridTable')\n",
    "        main_content = element.get_attribute('innerHTML')\n",
    "        表格_html[p] = main_content"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1\t2\t3\t4\t5\t6\t7\t8\t9\t10\t11\t12\t13\t14\t15\t16\t17\t18\t"
     ]
    }
   ],
   "source": [
    "process_pages(pages)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>html_snippets</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                        html_snippets\n",
       "1   \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "2   \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "3   \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "4   \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "5   \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "6   \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "7   \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "8   \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "9   \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "10  \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "11  \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "12  \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "13  \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "14  \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "15  \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "16  \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "17  \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "18  \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ..."
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "df = pd.DataFrame([表格_html]).T\n",
    "df.columns = [\"html_snippets\"]\n",
    "display(df)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 存储"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 存储网页tsv文件"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "网站=\"知网\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [],
   "source": [
    "fn = { \"output\" : { \"htm_snippets\": \"data_raw_src/知网_htm_snippets_{网站}.tsv\"}}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [],
   "source": [
    "filename = fn [\"output\"] [\"htm_snippets\"] \n",
    "df.to_csv(filename.format(网站=网站), sep=\"\\t\", encoding=\"utf8\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [],
   "source": [
    "l_df = []\n",
    "for p in pages:\n",
    "    表格=pd.read_html(表格_html[p])[0]\n",
    "    l_df.append(表格)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Unnamed: 0</th>\n",
       "      <th>篇名</th>\n",
       "      <th>作者</th>\n",
       "      <th>刊名</th>\n",
       "      <th>发表时间</th>\n",
       "      <th>被引</th>\n",
       "      <th>下载</th>\n",
       "      <th>操作</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>社会化媒体、移动终端、大数据:影响新闻生产的新技术因素</td>\n",
       "      <td>彭兰</td>\n",
       "      <td>新闻界</td>\n",
       "      <td>2012-08-20</td>\n",
       "      <td>447.0</td>\n",
       "      <td>13715.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>大数据带给图书馆的影响与挑战</td>\n",
       "      <td>韩翠峰</td>\n",
       "      <td>图书与情报</td>\n",
       "      <td>2012-10-15</td>\n",
       "      <td>328.0</td>\n",
       "      <td>9489.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>传统媒体中“大数据”的“微传播”</td>\n",
       "      <td>袁然</td>\n",
       "      <td>青年记者</td>\n",
       "      <td>2012-10-20</td>\n",
       "      <td>14.0</td>\n",
       "      <td>775.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>后媒体时代的新闻生产——2012新媒体年度盘点</td>\n",
       "      <td>栾轶玫</td>\n",
       "      <td>新闻与写作</td>\n",
       "      <td>2012-12-05</td>\n",
       "      <td>9.0</td>\n",
       "      <td>447.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>危机传播:无需逃避的现代性“陷阱”</td>\n",
       "      <td>王朋进</td>\n",
       "      <td>青年记者</td>\n",
       "      <td>2013-01-10</td>\n",
       "      <td>1.0</td>\n",
       "      <td>164.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>859</th>\n",
       "      <td>910</td>\n",
       "      <td>多维视角下的公共管理决策方法与技术——评《公共管理方法与技术》</td>\n",
       "      <td>麻晓宏</td>\n",
       "      <td>热带作物学报</td>\n",
       "      <td>2021-05-25</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>860</th>\n",
       "      <td>911</td>\n",
       "      <td>新技术赋能市场监管智能化：图景、障碍与进路</td>\n",
       "      <td>王湘军; 庞尚尚</td>\n",
       "      <td>行政论坛</td>\n",
       "      <td>2021-05-25</td>\n",
       "      <td>NaN</td>\n",
       "      <td>24.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>861</th>\n",
       "      <td>912</td>\n",
       "      <td>智能阅读：新时代阅读的新趋势</td>\n",
       "      <td>方卿; 王欣月; 王嘉昀</td>\n",
       "      <td>科技与出版</td>\n",
       "      <td>2021-06-02 08:55</td>\n",
       "      <td>NaN</td>\n",
       "      <td>49.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>862</th>\n",
       "      <td>913</td>\n",
       "      <td>互联网新媒体传播中农村职业培训脱贫致富基本内涵的逻辑生成</td>\n",
       "      <td>杨宗晓; 杨克</td>\n",
       "      <td>农业经济</td>\n",
       "      <td>2021-06-04</td>\n",
       "      <td>NaN</td>\n",
       "      <td>82.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>863</th>\n",
       "      <td>914</td>\n",
       "      <td>刍议基于社交媒体数据的电视节目评价体系</td>\n",
       "      <td>马绪峰; 赵鑫磊; 宋凯</td>\n",
       "      <td>中国电视</td>\n",
       "      <td>2021-06-15</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>914 rows × 8 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "     Unnamed: 0                               篇名            作者      刊名  \\\n",
       "0             1      社会化媒体、移动终端、大数据:影响新闻生产的新技术因素            彭兰     新闻界   \n",
       "1             2                   大数据带给图书馆的影响与挑战           韩翠峰   图书与情报   \n",
       "2             3                 传统媒体中“大数据”的“微传播”            袁然    青年记者   \n",
       "3             4          后媒体时代的新闻生产——2012新媒体年度盘点           栾轶玫   新闻与写作   \n",
       "4             5                危机传播:无需逃避的现代性“陷阱”           王朋进    青年记者   \n",
       "..          ...                              ...           ...     ...   \n",
       "859         910  多维视角下的公共管理决策方法与技术——评《公共管理方法与技术》           麻晓宏  热带作物学报   \n",
       "860         911            新技术赋能市场监管智能化：图景、障碍与进路      王湘军; 庞尚尚    行政论坛   \n",
       "861         912                   智能阅读：新时代阅读的新趋势  方卿; 王欣月; 王嘉昀   科技与出版   \n",
       "862         913     互联网新媒体传播中农村职业培训脱贫致富基本内涵的逻辑生成       杨宗晓; 杨克    农业经济   \n",
       "863         914              刍议基于社交媒体数据的电视节目评价体系  马绪峰; 赵鑫磊; 宋凯    中国电视   \n",
       "\n",
       "                 发表时间     被引       下载  操作  \n",
       "0          2012-08-20  447.0  13715.0  下载  \n",
       "1          2012-10-15  328.0   9489.0  下载  \n",
       "2          2012-10-20   14.0    775.0  下载  \n",
       "3          2012-12-05    9.0    447.0  下载  \n",
       "4          2013-01-10    1.0    164.0  下载  \n",
       "..                ...    ...      ...  ..  \n",
       "859        2021-05-25    NaN      2.0  下载  \n",
       "860        2021-05-25    NaN     24.0  下载  \n",
       "861  2021-06-02 08:55    NaN     49.0  下载  \n",
       "862        2021-06-04    NaN     82.0  下载  \n",
       "863        2021-06-15    NaN      NaN  下载  \n",
       "\n",
       "[914 rows x 8 columns]"
      ]
     },
     "execution_count": 36,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_url_out = pd.concat(l_df).reset_index(drop=True)\n",
    "df_总表=首页_表格.append(df_url_out)\n",
    "df_总表"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 存所有翻页的所有文章"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Unnamed: 0</th>\n",
       "      <th>篇名</th>\n",
       "      <th>作者</th>\n",
       "      <th>刊名</th>\n",
       "      <th>发表时间</th>\n",
       "      <th>被引</th>\n",
       "      <th>下载</th>\n",
       "      <th>操作</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>社会化媒体、移动终端、大数据:影响新闻生产的新技术因素</td>\n",
       "      <td>彭兰</td>\n",
       "      <td>新闻界</td>\n",
       "      <td>2012-08-20</td>\n",
       "      <td>447.0</td>\n",
       "      <td>13715.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>大数据带给图书馆的影响与挑战</td>\n",
       "      <td>韩翠峰</td>\n",
       "      <td>图书与情报</td>\n",
       "      <td>2012-10-15</td>\n",
       "      <td>328.0</td>\n",
       "      <td>9489.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>传统媒体中“大数据”的“微传播”</td>\n",
       "      <td>袁然</td>\n",
       "      <td>青年记者</td>\n",
       "      <td>2012-10-20</td>\n",
       "      <td>14.0</td>\n",
       "      <td>775.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>后媒体时代的新闻生产——2012新媒体年度盘点</td>\n",
       "      <td>栾轶玫</td>\n",
       "      <td>新闻与写作</td>\n",
       "      <td>2012-12-05</td>\n",
       "      <td>9.0</td>\n",
       "      <td>447.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>危机传播:无需逃避的现代性“陷阱”</td>\n",
       "      <td>王朋进</td>\n",
       "      <td>青年记者</td>\n",
       "      <td>2013-01-10</td>\n",
       "      <td>1.0</td>\n",
       "      <td>164.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>859</th>\n",
       "      <td>910</td>\n",
       "      <td>多维视角下的公共管理决策方法与技术——评《公共管理方法与技术》</td>\n",
       "      <td>麻晓宏</td>\n",
       "      <td>热带作物学报</td>\n",
       "      <td>2021-05-25</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>860</th>\n",
       "      <td>911</td>\n",
       "      <td>新技术赋能市场监管智能化：图景、障碍与进路</td>\n",
       "      <td>王湘军; 庞尚尚</td>\n",
       "      <td>行政论坛</td>\n",
       "      <td>2021-05-25</td>\n",
       "      <td>NaN</td>\n",
       "      <td>24.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>861</th>\n",
       "      <td>912</td>\n",
       "      <td>智能阅读：新时代阅读的新趋势</td>\n",
       "      <td>方卿; 王欣月; 王嘉昀</td>\n",
       "      <td>科技与出版</td>\n",
       "      <td>2021-06-02 08:55</td>\n",
       "      <td>NaN</td>\n",
       "      <td>49.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>862</th>\n",
       "      <td>913</td>\n",
       "      <td>互联网新媒体传播中农村职业培训脱贫致富基本内涵的逻辑生成</td>\n",
       "      <td>杨宗晓; 杨克</td>\n",
       "      <td>农业经济</td>\n",
       "      <td>2021-06-04</td>\n",
       "      <td>NaN</td>\n",
       "      <td>82.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>863</th>\n",
       "      <td>914</td>\n",
       "      <td>刍议基于社交媒体数据的电视节目评价体系</td>\n",
       "      <td>马绪峰; 赵鑫磊; 宋凯</td>\n",
       "      <td>中国电视</td>\n",
       "      <td>2021-06-15</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>914 rows × 8 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "     Unnamed: 0                               篇名            作者      刊名  \\\n",
       "0             1      社会化媒体、移动终端、大数据:影响新闻生产的新技术因素            彭兰     新闻界   \n",
       "1             2                   大数据带给图书馆的影响与挑战           韩翠峰   图书与情报   \n",
       "2             3                 传统媒体中“大数据”的“微传播”            袁然    青年记者   \n",
       "3             4          后媒体时代的新闻生产——2012新媒体年度盘点           栾轶玫   新闻与写作   \n",
       "4             5                危机传播:无需逃避的现代性“陷阱”           王朋进    青年记者   \n",
       "..          ...                              ...           ...     ...   \n",
       "859         910  多维视角下的公共管理决策方法与技术——评《公共管理方法与技术》           麻晓宏  热带作物学报   \n",
       "860         911            新技术赋能市场监管智能化：图景、障碍与进路      王湘军; 庞尚尚    行政论坛   \n",
       "861         912                   智能阅读：新时代阅读的新趋势  方卿; 王欣月; 王嘉昀   科技与出版   \n",
       "862         913     互联网新媒体传播中农村职业培训脱贫致富基本内涵的逻辑生成       杨宗晓; 杨克    农业经济   \n",
       "863         914              刍议基于社交媒体数据的电视节目评价体系  马绪峰; 赵鑫磊; 宋凯    中国电视   \n",
       "\n",
       "                 发表时间     被引       下载  操作  \n",
       "0          2012-08-20  447.0  13715.0  下载  \n",
       "1          2012-10-15  328.0   9489.0  下载  \n",
       "2          2012-10-20   14.0    775.0  下载  \n",
       "3          2012-12-05    9.0    447.0  下载  \n",
       "4          2013-01-10    1.0    164.0  下载  \n",
       "..                ...    ...      ...  ..  \n",
       "859        2021-05-25    NaN      2.0  下载  \n",
       "860        2021-05-25    NaN     24.0  下载  \n",
       "861  2021-06-02 08:55    NaN     49.0  下载  \n",
       "862        2021-06-04    NaN     82.0  下载  \n",
       "863        2021-06-15    NaN      NaN  下载  \n",
       "\n",
       "[914 rows x 8 columns]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# 存表格\n",
    "with pd.ExcelWriter('知网检索数据.xlsx',mode='w',engine=\"openpyxl\") as writer:  \n",
    "            df_总表.to_excel(writer,sheet_name=\"知网数据\")\n",
    "display(df_总表)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 导出refworks文件（.txt）  "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 回到首页\n",
    "element =driver.find_element_by_xpath('/html/body/div[3]/div[2]/div[2]/div[2]/form/div/div[2]/a[1]')\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "metadata": {},
   "outputs": [],
   "source": [
    "全选 = driver.find_element_by_xpath('/html/body/div[3]/div[2]/div[2]/div[2]/form/div/div[1]/div[2]/div[1]/label/input')\n",
    "全选.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# 清除已选数据\n",
    "element =driver.find_element_by_xpath('/html/body/div[3]/div[2]/div[2]/div[2]/form/div/div[1]/div[2]/div[1]/a')\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 导出refworks文件和批量下载前500篇文章"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]\n",
      "[10, 11, 12, 13, 14, 15, 16, 17, 18]\n"
     ]
    }
   ],
   "source": [
    "# 页数\n",
    "pages1 = list(range(0,11))\n",
    "pages2 =list(range(10,19))\n",
    "print(pages1)\n",
    "print(pages2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [],
   "source": [
    "def selectCheckbox (pages1):\n",
    "    for p in pages1:\n",
    "        print (p,end='\\t')\n",
    "        跳转 = driver.find_element_by_id('PageNext')\n",
    "        跳转.click()\n",
    "        time.sleep(10+5*random())\n",
    "        全选 = driver.find_element_by_xpath('/html/body/div[3]/div[2]/div[2]/div[2]/form/div/div[1]/div[2]/div[1]/label/input')\n",
    "        全选.click()\n",
    "        time.sleep(5+5*random())        "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {
    "collapsed": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1\t2\t3\t4\t5\t6\t7\t8\t9\t10\t11\t12\t"
     ]
    },
    {
     "ename": "UnexpectedAlertPresentException",
     "evalue": "Alert Text: 不能超过500个,若要重新选择,请按清除按钮,再进行选取操作\nMessage: unexpected alert open: {Alert text : 不能超过500个,若要重新选择,请按清除按钮,再进行选取操作}\n  (Session info: chrome=91.0.4472.106)\n",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mUnexpectedAlertPresentException\u001b[0m           Traceback (most recent call last)",
      "\u001b[1;32m<ipython-input-51-1d8d2a636c4c>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mselectCheckbox\u001b[0m \u001b[1;33m(\u001b[0m\u001b[0mpages1\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[1;32m<ipython-input-50-ab1a27621858>\u001b[0m in \u001b[0;36mselectCheckbox\u001b[1;34m(pages1)\u001b[0m\n\u001b[0;32m      2\u001b[0m     \u001b[1;32mfor\u001b[0m \u001b[0mp\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mpages\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m      3\u001b[0m         \u001b[0mprint\u001b[0m \u001b[1;33m(\u001b[0m\u001b[0mp\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0mend\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;34m'\\t'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 4\u001b[1;33m         \u001b[0m跳转\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mdriver\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfind_element_by_id\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'PageNext'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m      5\u001b[0m         \u001b[0m跳转\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mclick\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m      6\u001b[0m         \u001b[0mtime\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msleep\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;36m10\u001b[0m\u001b[1;33m+\u001b[0m\u001b[1;36m5\u001b[0m\u001b[1;33m*\u001b[0m\u001b[0mrandom\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mD:\\anaconda\\lib\\site-packages\\selenium\\webdriver\\remote\\webdriver.py\u001b[0m in \u001b[0;36mfind_element_by_id\u001b[1;34m(self, id_)\u001b[0m\n\u001b[0;32m    358\u001b[0m             \u001b[0melement\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mdriver\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfind_element_by_id\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'foo'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    359\u001b[0m         \"\"\"\n\u001b[1;32m--> 360\u001b[1;33m         \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfind_element\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mby\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mBy\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mID\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mvalue\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mid_\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    361\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    362\u001b[0m     \u001b[1;32mdef\u001b[0m \u001b[0mfind_elements_by_id\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mid_\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mD:\\anaconda\\lib\\site-packages\\selenium\\webdriver\\remote\\webdriver.py\u001b[0m in \u001b[0;36mfind_element\u001b[1;34m(self, by, value)\u001b[0m\n\u001b[0;32m    974\u001b[0m                 \u001b[0mby\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mBy\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mCSS_SELECTOR\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    975\u001b[0m                 \u001b[0mvalue\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;34m'[name=\"%s\"]'\u001b[0m \u001b[1;33m%\u001b[0m \u001b[0mvalue\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 976\u001b[1;33m         return self.execute(Command.FIND_ELEMENT, {\n\u001b[0m\u001b[0;32m    977\u001b[0m             \u001b[1;34m'using'\u001b[0m\u001b[1;33m:\u001b[0m \u001b[0mby\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    978\u001b[0m             'value': value})['value']\n",
      "\u001b[1;32mD:\\anaconda\\lib\\site-packages\\selenium\\webdriver\\remote\\webdriver.py\u001b[0m in \u001b[0;36mexecute\u001b[1;34m(self, driver_command, params)\u001b[0m\n\u001b[0;32m    319\u001b[0m         \u001b[0mresponse\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcommand_executor\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mexecute\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdriver_command\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mparams\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    320\u001b[0m         \u001b[1;32mif\u001b[0m \u001b[0mresponse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 321\u001b[1;33m             \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0merror_handler\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcheck_response\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mresponse\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    322\u001b[0m             response['value'] = self._unwrap_value(\n\u001b[0;32m    323\u001b[0m                 response.get('value', None))\n",
      "\u001b[1;32mD:\\anaconda\\lib\\site-packages\\selenium\\webdriver\\remote\\errorhandler.py\u001b[0m in \u001b[0;36mcheck_response\u001b[1;34m(self, response)\u001b[0m\n\u001b[0;32m    239\u001b[0m             \u001b[1;32melif\u001b[0m \u001b[1;34m'alert'\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mvalue\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    240\u001b[0m                 \u001b[0malert_text\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mvalue\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'alert'\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mget\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'text'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 241\u001b[1;33m             \u001b[1;32mraise\u001b[0m \u001b[0mexception_class\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mmessage\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mscreen\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mstacktrace\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0malert_text\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    242\u001b[0m         \u001b[1;32mraise\u001b[0m \u001b[0mexception_class\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mmessage\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mscreen\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mstacktrace\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    243\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;31mUnexpectedAlertPresentException\u001b[0m: Alert Text: 不能超过500个,若要重新选择,请按清除按钮,再进行选取操作\nMessage: unexpected alert open: {Alert text : 不能超过500个,若要重新选择,请按清除按钮,再进行选取操作}\n  (Session info: chrome=91.0.4472.106)\n"
     ]
    }
   ],
   "source": [
    "# First pass: tick select-all while paging through pages1\n",
    "selectCheckbox(pages1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 导出前500篇refworks文件"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Export the first 500 records\n",
    "# Click 导出与分析 (Export & Analysis)\n",
    "driver.find_element_by_xpath(\n",
    "    '/html/body/div[3]/div[2]/div[2]/div[2]/form/div/div[1]/div[2]/ul[1]/li[2]/i'\n",
    ").click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Click 导出文献 (Export documents)\n",
    "driver.find_element_by_xpath(\n",
    "    '/html/body/div[3]/div[2]/div[2]/div[2]/form/div/div[1]/div[2]/ul[1]/li[2]/ul/li[1]/a'\n",
    ").click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Choose the Refworks format\n",
    "driver.find_element_by_xpath(\n",
    "    '/html/body/div[3]/div[2]/div[2]/div[2]/form/div/div[1]/div[2]/ul[1]/li[2]/ul/li[1]/ul/li[8]/a'\n",
    ").click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['CDwindow-A52E504B4221D8947E9D3CE94C46D0D8',\n",
       " 'CDwindow-FBF8B320C59D71E366FAAC2B32887BD2',\n",
       " 'CDwindow-18FE5CBE3DD7A6852BEB88FD75A41CD1']"
      ]
     },
     "execution_count": 55,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "## List all open window handles\n",
    "driver.window_handles"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "# Switch to the third window (index 2 in driver.window_handles)\n",
    "driver.switch_to.window(driver.window_handles[2])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Click to export the first 500 records as a txt file\n",
    "driver.find_element_by_xpath('/html/body/div[3]/div/div[2]/div[1]/ul/li[3]/a').click()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 批量下载前500篇文章"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Click batch download for the first 500 articles\n",
    "driver.find_element_by_xpath('/html/body/div[3]/div/div[2]/div[1]/ul/li[2]/a').click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Switch to the fourth window (index 3 in driver.window_handles)\n",
    "driver.switch_to.window(driver.window_handles[3])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Click the batch-download button\n",
    "driver.find_element_by_xpath('/html/body/div[1]/div/div[1]/button').click()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 跳转页面"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Switch back to the second window (index 1 in driver.window_handles)\n",
    "driver.switch_to.window(driver.window_handles[1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Go back to page 11, the last page of the first pass\n",
    "driver.find_element_by_xpath(\n",
    "    '/html/body/div[3]/div[2]/div[2]/div[2]/form/div/div[2]/a[8]'\n",
    ").click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Clear the records selected so far\n",
    "driver.find_element_by_xpath(\n",
    "    '/html/body/div[3]/div[2]/div[2]/div[2]/form/div/div[1]/div[2]/div[1]/a'\n",
    ").click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "metadata": {},
   "outputs": [],
   "source": [
    "def selectCheckbox(pages2, page_wait=45, page_jitter=20, select_wait=5, select_jitter=10):\n",
    "    \"\"\"Page forward once per entry of pages2, ticking select-all on every new page.\n",
    "\n",
    "    Re-defines the earlier function of the same name with longer default delays.\n",
    "\n",
    "    Args:\n",
    "        pages2: iterable of labels; each is printed and costs one next-page click.\n",
    "        page_wait, page_jitter: sleep page_wait + page_jitter*random() seconds\n",
    "            after clicking next page.\n",
    "        select_wait, select_jitter: same, after ticking select-all.\n",
    "\n",
    "    Stops early instead of raising when the next-page button is missing or\n",
    "    when the site raises an alert (e.g. its 500-item selection limit).\n",
    "    \"\"\"\n",
    "    # Local import keeps this cell self-contained.\n",
    "    from selenium.common.exceptions import (\n",
    "        NoSuchElementException,\n",
    "        UnexpectedAlertPresentException,\n",
    "    )\n",
    "\n",
    "    select_all_xpath = '/html/body/div[3]/div[2]/div[2]/div[2]/form/div/div[1]/div[2]/div[1]/label/input'\n",
    "    try:\n",
    "        for p in pages2:\n",
    "            print(p, end='\\t')\n",
    "            try:\n",
    "                next_button = driver.find_element_by_id('PageNext')\n",
    "            except NoSuchElementException:\n",
    "                print('\\nNo next-page button found; stopping.')\n",
    "                break\n",
    "            next_button.click()\n",
    "            time.sleep(page_wait + page_jitter * random())\n",
    "            driver.find_element_by_xpath(select_all_xpath).click()\n",
    "            time.sleep(select_wait + select_jitter * random())\n",
    "    except UnexpectedAlertPresentException as alert_error:\n",
    "        # NOTE(review): assumes the driver has already closed the alert when it\n",
    "        # raises this exception (chromedriver default) - confirm.\n",
    "        print('\\nStopped by page alert:', alert_error.alert_text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "metadata": {
    "collapsed": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "10\t11\t12\t13\t14\t15\t16\t17\t18\t"
     ]
    },
    {
     "ename": "NoSuchElementException",
     "evalue": "Message: no such element: Unable to locate element: {\"method\":\"css selector\",\"selector\":\"[id=\"PageNext\"]\"}\n  (Session info: chrome=91.0.4472.106)\n",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mNoSuchElementException\u001b[0m                    Traceback (most recent call last)",
      "\u001b[1;32m<ipython-input-70-cd6c72a2cefd>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mselectCheckbox\u001b[0m \u001b[1;33m(\u001b[0m\u001b[0mpages2\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[1;32m<ipython-input-69-e923ae31b5a5>\u001b[0m in \u001b[0;36mselectCheckbox\u001b[1;34m(pages2)\u001b[0m\n\u001b[0;32m      2\u001b[0m     \u001b[1;32mfor\u001b[0m \u001b[0mp\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mpages2\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m      3\u001b[0m         \u001b[0mprint\u001b[0m \u001b[1;33m(\u001b[0m\u001b[0mp\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0mend\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;34m'\\t'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 4\u001b[1;33m         \u001b[0m跳转\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mdriver\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfind_element_by_id\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'PageNext'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m      5\u001b[0m         \u001b[0m跳转\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mclick\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m      6\u001b[0m         \u001b[0mtime\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msleep\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;36m45\u001b[0m\u001b[1;33m+\u001b[0m\u001b[1;36m20\u001b[0m\u001b[1;33m*\u001b[0m\u001b[0mrandom\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mD:\\anaconda\\lib\\site-packages\\selenium\\webdriver\\remote\\webdriver.py\u001b[0m in \u001b[0;36mfind_element_by_id\u001b[1;34m(self, id_)\u001b[0m\n\u001b[0;32m    358\u001b[0m             \u001b[0melement\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mdriver\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfind_element_by_id\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'foo'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    359\u001b[0m         \"\"\"\n\u001b[1;32m--> 360\u001b[1;33m         \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfind_element\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mby\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mBy\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mID\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mvalue\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mid_\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    361\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    362\u001b[0m     \u001b[1;32mdef\u001b[0m \u001b[0mfind_elements_by_id\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mid_\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mD:\\anaconda\\lib\\site-packages\\selenium\\webdriver\\remote\\webdriver.py\u001b[0m in \u001b[0;36mfind_element\u001b[1;34m(self, by, value)\u001b[0m\n\u001b[0;32m    974\u001b[0m                 \u001b[0mby\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mBy\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mCSS_SELECTOR\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    975\u001b[0m                 \u001b[0mvalue\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;34m'[name=\"%s\"]'\u001b[0m \u001b[1;33m%\u001b[0m \u001b[0mvalue\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 976\u001b[1;33m         return self.execute(Command.FIND_ELEMENT, {\n\u001b[0m\u001b[0;32m    977\u001b[0m             \u001b[1;34m'using'\u001b[0m\u001b[1;33m:\u001b[0m \u001b[0mby\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    978\u001b[0m             'value': value})['value']\n",
      "\u001b[1;32mD:\\anaconda\\lib\\site-packages\\selenium\\webdriver\\remote\\webdriver.py\u001b[0m in \u001b[0;36mexecute\u001b[1;34m(self, driver_command, params)\u001b[0m\n\u001b[0;32m    319\u001b[0m         \u001b[0mresponse\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcommand_executor\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mexecute\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdriver_command\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mparams\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    320\u001b[0m         \u001b[1;32mif\u001b[0m \u001b[0mresponse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 321\u001b[1;33m             \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0merror_handler\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcheck_response\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mresponse\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    322\u001b[0m             response['value'] = self._unwrap_value(\n\u001b[0;32m    323\u001b[0m                 response.get('value', None))\n",
      "\u001b[1;32mD:\\anaconda\\lib\\site-packages\\selenium\\webdriver\\remote\\errorhandler.py\u001b[0m in \u001b[0;36mcheck_response\u001b[1;34m(self, response)\u001b[0m\n\u001b[0;32m    240\u001b[0m                 \u001b[0malert_text\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mvalue\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'alert'\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mget\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'text'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    241\u001b[0m             \u001b[1;32mraise\u001b[0m \u001b[0mexception_class\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mmessage\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mscreen\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mstacktrace\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0malert_text\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 242\u001b[1;33m         \u001b[1;32mraise\u001b[0m \u001b[0mexception_class\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mmessage\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mscreen\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mstacktrace\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    243\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    244\u001b[0m     \u001b[1;32mdef\u001b[0m \u001b[0m_value_or_default\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mobj\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mkey\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdefault\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;31mNoSuchElementException\u001b[0m: Message: no such element: Unable to locate element: {\"method\":\"css selector\",\"selector\":\"[id=\"PageNext\"]\"}\n  (Session info: chrome=91.0.4472.106)\n"
     ]
    }
   ],
   "source": [
    "# Second pass: tick select-all while paging through pages2\n",
    "selectCheckbox(pages2)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 导出后414篇refworks文件"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Click 导出与分析 (Export & Analysis)\n",
    "driver.find_element_by_xpath(\n",
    "    '/html/body/div[3]/div[2]/div[2]/div[2]/form/div/div[1]/div[2]/ul[1]/li[2]/i'\n",
    ").click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 72,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Click 导出文献 (Export documents)\n",
    "driver.find_element_by_xpath(\n",
    "    '/html/body/div[3]/div[2]/div[2]/div[2]/form/div/div[1]/div[2]/ul[1]/li[2]/ul/li[1]/a'\n",
    ").click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 73,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Choose the Refworks format\n",
    "driver.find_element_by_xpath(\n",
    "    '/html/body/div[3]/div[2]/div[2]/div[2]/form/div/div[1]/div[2]/ul[1]/li[2]/ul/li[1]/ul/li[8]/a'\n",
    ").click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 74,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Switch to the third window (index 2 in driver.window_handles)\n",
    "# NOTE(review): index 2 may still be the export window from the first pass - confirm.\n",
    "driver.switch_to.window(driver.window_handles[2])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 75,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Click to export the remaining 414 records as a txt file\n",
    "driver.find_element_by_xpath('/html/body/div[3]/div/div[2]/div[1]/ul/li[3]/a').click()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 批量下载后414篇文章"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 76,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Click batch download for the remaining 414 articles\n",
    "driver.find_element_by_xpath('/html/body/div[3]/div/div[2]/div[1]/ul/li[2]/a').click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 77,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Switch to the fourth window (index 3 in driver.window_handles)\n",
    "driver.switch_to.window(driver.window_handles[3])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 78,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Click the batch-download button\n",
    "driver.find_element_by_xpath('/html/body/div[1]/div/div[1]/button').click()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 完成"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.3"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {
    "height": "calc(100% - 180px)",
    "left": "10px",
    "top": "150px",
    "width": "248.532px"
   },
   "toc_section_display": true,
   "toc_window_display": true
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
